The two data sets used in this analysis are the Madison case data sourced from the Wisconsin DHS and wastewater concentration data produced by the Wisconsin State Laboratory of Hygiene. This wastewater data has entries every couple of days from November 15 2020 to June 24 2021.
“Files Used:”
Z:/COVID-19_WastewaterAnalysis/data/processed/MMSD_Interceptor_Cases_2_7_22.csv
Z:/COVID-19_WastewaterAnalysis/data/processed/LIMSWasteData_02-09-22.csv
| Date | Site | Cases | SevenDayMACases | EpisodeCase | SpecCollectedCase | N1 | BCoV | N2 | PMMoV | GeoMeanN12 | FlowRate | temperature | equiv_sewage_amt | sample_id | cov2_conc | BCoVConc | capacity_mgd | composite_freq | concentration_method | conductivity | county | do | epaid | extraction_method | inhibition_adjust | inhibition_detect | inhibition_method | lod_ref | n1_lod | n1_loq | n1_ntc_amplify | n1_num_no_target_control | n1_num_ntc_amplify | N1Error | n1_sars_cov2_lod | n1_sars_cov2_units | n2_lod | n2_loq | n2_ntc_amplify | n2_num_no_target_control | n2_num_ntc_amplify | N2Error | n2_sars_cov2_lod | n2_sars_cov2_units | pcr_type | ph | Pop | quant_stan_ref | quant_stan_type | sample_collect_time | sample_location | sample_location_specify | sample_matrix | sample_type | state | RepDate | tss | wwtp_comments | zipcode | analytical_comments | unformatted_collectdate | mycustomer | AVG | wt | N1Pure | N2Pure |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2020-09-11 | MMSD-P11 | 10 | 42.85714 | 18 | 22 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-11 | MMSD-P18 | 9 | 21.42857 | 9 | 11 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-11 | MMSD-P2 | 107 | 23.28571 | 92 | 195 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-11 | MMSD-P7 | 2 | 22.85714 | 2 | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-11 | MMSD-P8 | 10 | 41.00000 | 8 | 8 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-12 | MMSD-P11 | 15 | 40.42857 | 14 | 17 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
The case data has a strong weekend effect so for this section we look at a seven day smoothing of cases. The simple display of the data shows the core components of this story. First that wastewater data is very noisy. And that there is a hint of a relationship between the two signals.
FirstImpressionDF <- FullDF%>%
NoNa("N1","Cases")
FirstImpression <- FirstImpressionDF%>%#Removing outliers
ggplot(aes(x=Date))+#Data depends on time
geom_line(aes(y=N1, color="N1",info=N1))+#compares N1 to Cases
geom_line(aes(y=MinMaxFixing(PMMoV,N1), color="PMMoV",info=PMMoV))+
geom_line(aes(y=MinMaxFixing(FlowRate,N1), color="FlowRate",info=FlowRate))+
geom_line(aes(y=MinMaxFixing(BCoV,N1), color="BCoV",info=BCoV))+
#geom_line(aes(y=MinMaxFixing(temperature,N1), color="temperature",info=temperature))+
#geom_line(aes(y=MinMaxFixing(tss,N1), color="tss",info=tss))+
geom_line(aes(y=MinMaxFixing(equiv_sewage_amt,N1), color="equiv_sewage_amt",info=equiv_sewage_amt))+
geom_line(aes(y=MinMaxFixing(SevenDayMACases,N1), color="Seven Day MA Cases",info=Cases))+
geom_line(aes(y=MinMaxFixing(N2,N1), color="N2",info=N2))+
labs(y="N1")
ggplotly(FirstImpression)
#To remove weekend effects we are looking at the 7 day smoothing of cases.
DaySmoothed=21#Very wide smoothing to find where the data strong deviate from trend
ErrorMarkedDF <- FullDF%>%#Remove older Var data that clearly has no relationship to Cases
mutate(EarlyOutlier = ifelse(Date < mdy("11/20/2020"),TRUE,FALSE))#replace old.
#"11/20/2020"
#"1/1/2021"
ErrorMarkedDF$DetectedOutlierN1 <- IdentifyOutliers(ErrorMarkedDF$N1)
ErrorMarkedDF$DetectedOutlierN2 <- IdentifyOutliers(ErrorMarkedDF$N2)
ErrorMarkedDF$DetectedOutlierGeo <- IdentifyOutliers(ErrorMarkedDF$GeoMeanN12)
ErrorMarkedDF$DetectedOutlierPMMOV <- IdentifyOutliers(ErrorMarkedDF$PMMoV)
ErrorMarkedDF$DetectedOutlierBCOv <- IdentifyOutliers(ErrorMarkedDF$BCoV)
ErrorRemovedDF <- ErrorMarkedDF%>%
mutate(PotentialOutlier = Date>mdy("03/31/2021")&mdy("05/12/2021")>Date)%>%
select(-EarlyOutlier)#Removing unneeded calculated columns
ErrorMarkedDF%>%
select(-EarlyOutlier)%>%
#filter(DetectedOutlierN1|DetectedOutlierN2|DetectedOutlierGeo)%>%
mutate(NumberOfOutliers = DetectedOutlierN1+DetectedOutlierN2+DetectedOutlierGeo)%>%
group_by(NumberOfOutliers)%>%
summarize(n())
## # A tibble: 4 x 2
## NumberOfOutliers `n()`
## <int> <int>
## 1 0 7856
## 2 1 93
## 3 2 86
## 4 3 206
ErrorMarkedDF%>%
select(-EarlyOutlier)%>%
summarize(N1 = sum(DetectedOutlierN1), N2 = sum(DetectedOutlierN2), Geo = sum(DetectedOutlierGeo), N12 =sum(DetectedOutlierN1*DetectedOutlierN2), N1Geo =sum(DetectedOutlierN1*DetectedOutlierGeo), N2Geo =sum(DetectedOutlierN2*DetectedOutlierGeo), AllVars =sum(DetectedOutlierN1*DetectedOutlierN2*DetectedOutlierGeo))
## N1 N2 Geo N12 N1Geo N2Geo AllVars
## 1 304 292 287 217 242 245 206
OutlierComp <- ErrorMarkedDF%>%
filter(DetectedOutlierN1|DetectedOutlierN2|DetectedOutlierGeo)%>%
mutate(Name = "",Name = paste(Name,ifelse(DetectedOutlierN1,"N1","")),
Name = paste(Name,ifelse(DetectedOutlierN2,"N2","")),
Name = paste(Name,ifelse(DetectedOutlierGeo,"Geo","")))%>%
ggplot(aes(x=Date))+
geom_point(aes(y=N1,shape="N1",color=Name))+
geom_point(aes(y=N2,shape="N2",color=Name))+
geom_point(aes(y=GeoMeanN12,shape="Geo",color=Name))
ggplotly(OutlierComp)
#,info1=N1,info2=GeoMeanN12
OutlierComp <- ErrorMarkedDF%>%
filter(DetectedOutlierN1|DetectedOutlierPMMOV|DetectedOutlierBCOv)%>%
mutate(Name = "",Name = paste(Name,ifelse(DetectedOutlierN1,"N1","")),
Name = paste(Name,ifelse(DetectedOutlierPMMOV,"PMMoV","")),
Name = paste(Name,ifelse(DetectedOutlierBCOv,"BCoV","")))%>%
ggplot(aes(x=Date))+
geom_point(aes(y=N1,color=Name,info1=PMMoV,info2=BCoV))
ggplotly(OutlierComp)
#ErrorMarkedDF%>%
# select(-EarlyOutlier)%>%
# write.csv(file="OutlierFlagDF")
#wwtp_comments
#analytical_comments
#ErrorRemovedDF%>%
# filter(!is.na(analytical_comments))
CoVarGraphic <- ErrorRemovedDF%>%
ggplot()+#LargeError
aes(y = N1,color=DetectedOutlier,info = Date)
#PotentialOutlier#DetectedOutlier
CoVarGraphicPMMoV <- CoVarGraphic+
geom_point(aes(x = PMMoV))
#Outliers
#26203622#29931423
#149752.0#461781.5
#Potintial outlier
#26463265#30048370
#172041.0#98442.5
CoVarGraphicFlowRate <- CoVarGraphic+
geom_point(aes(x = FlowRate))
CoVarGraphicBCoV <- CoVarGraphic+
geom_point(aes(x = BCoV))
CoVarGraphicequiv_sewage_amt <- CoVarGraphic+#temperature,equiv_sewage_amt
geom_point(aes(x = equiv_sewage_amt))
#PMMoV,FlowRate,BCoV,temperature,equiv_sewage_amt
ggplotly(CoVarGraphicPMMoV)
ggplotly(CoVarGraphicBCoV)
ggplotly(CoVarGraphicFlowRate)
ggplotly(CoVarGraphicequiv_sewage_amt)
ErrorRemovedDF%>%
group_by(DetectedOutlier)%>%
summarize(median(PMMoV,na.rm = TRUE),sd(PMMoV,na.rm = TRUE),
median(N1,na.rm = TRUE),sd(N1,na.rm = TRUE))
AA <- ErrorRemovedDF%>%
ggplot(aes(x=Date,y=equiv_sewage_amt))+
geom_point()
ggplotly(AA)
summary(lm(DetectedOutlier ~ PMMoV ,data = ErrorMarkedDF))
library(rpart)
fit <- rpart(DetectedOutlier ~ PMMoV + N1,
method="class", data=ErrorMarkedDF)
printcp(fit)
plotcp(fit)
summary(fit)
plot(fit)
text(fit, use.n=TRUE, all=TRUE, cex=.5)
CoVarGraphicPMMoV <- CoVarGraphic+
geom_point(aes(x = PMMoV))+
geom_hline(yintercept = 5.61e5)+
geom_vline(xintercept = 2.82e7)
ggplotly(CoVarGraphicPMMoV)
ErrorMarkedDF%>%
ggplot(aes(x = DetectedOutlier, y = log(PMMoV))) +
geom_boxplot()
t.test(PMMoV ~ DetectedOutlier, data = ErrorMarkedDF)
#26203622#29931423
#149752.0#461781.5
LoessFunc <- function(DF,SpanConstant = .01,Var="N1"){
MainDF <- DF
MainDF[[paste0("loess",Var)]] <- loessFit(y=(MainDF[[Var]]),
x=MainDF$Date, #create loess fit of the data
span=SpanConstant, #span of .2 seems to give the best result, not rigorously chosen
iterations=2)$fitted#2 iterations remove some bad patterns
return(MainDF)
}
LoessN1Messurement <- FullDF%>%
group_split(Site) %>%
purrr::map_dfr(LoessFunc)
ErrorCompPlot <- function(ErrorVar="N1",CompVar="N2"){
ErrorPlotDF <- FullDF%>%
group_split(Site) %>%
purrr::map_dfr(LoessFunc,Var = ErrorVar)%>%
filter(!is.na(!!sym(paste0("loess",ErrorVar))))%>%
mutate(TrendError = abs(!!sym(ErrorVar)-!!sym(paste0("loess",ErrorVar))))
NoLogPlot <- ErrorPlotDF%>%
ggplot(aes(y=TrendError,x=!!sym(CompVar),info=Date))+
geom_point()
LogPlot <- ErrorPlotDF%>%
ggplot(aes(y=TrendError,x=!!sym(CompVar),info=Date))+
geom_point()+
scale_y_log10()+
scale_x_log10()
#print(ggplotly(NoLogPlot))
#print(ggplotly(LogPlot))
print((NoLogPlot))
print((LogPlot))
Formula <- as.formula(
paste("TrendError",
paste(CompVar, collapse = " + "),
sep = " ~ "))
fit <- lm(Formula, data = ErrorPlotDF)
print(summary(fit)$coefficients[,4])
print(summary(fit)$r.squared)
return()
}
PlotVarNames <- LoessN1Messurement%>%
select_if(function(col) length(unique(col))>1)%>%
names()
#names(LoessN1Messurement)
for (Name in PlotVarNames){
try(
print(ErrorCompPlot(CompVar = Name))
)
}
## Error in Math.Date(x, base) : log not defined for "Date" objects
## Error in log(x, base) : non-numeric argument to mathematical function
## (Intercept) Cases
## 0.02940312 0.01208053
## [1] 0.00763837
## NULL
## (Intercept) SevenDayMACases
## 1.856535e-01 7.167533e-06
## [1] 0.02219889
## NULL
## (Intercept) EpisodeCase
## 0.2879335219 0.0001109867
## [1] 0.01755592
## NULL
## (Intercept) SpecCollectedCase
## 0.2985190617 0.0003453027
## [1] 0.01539464
## NULL
## (Intercept) N1
## 2.242998e-03 1.801557e-81
## [1] 0.06198631
## NULL
## (Intercept) BCoV
## 0.003978898 0.776838973
## [1] 1.417523e-05
## NULL
## (Intercept) N2
## 6.260085e-01 3.744956e-40
## [1] 0.03126069
## NULL
## (Intercept) PMMoV
## 4.451174e-05 9.699300e-01
## [1] 2.488381e-07
## NULL
## (Intercept) GeoMeanN12
## 9.103091e-02 2.554871e-59
## [1] 0.04513622
## NULL
## (Intercept) FlowRate
## 8.25086e-02 1.15294e-20
## [1] 0.01540143
## NULL
## (Intercept) temperature
## 0.1592601 0.7395512
## [1] 4.829313e-05
## NULL
## (Intercept) equiv_sewage_amt
## 0.001315222 0.083590044
## [1] 0.0005240121
## NULL
## (Intercept) sample_id
## 0.09342473 0.06506079
## [1] 0.0005957142
## NULL
## (Intercept) cov2_conc
## 8.661648e-02 3.853731e-59
## [1] 0.04501469
## NULL
## (Intercept) BCoVConc
## 0.03524225 0.10355942
## [1] 0.000463843
## NULL
## (Intercept) capacity_mgd
## 5.870041e-01 3.402299e-10
## [1] 0.007005544
## NULL
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## (Intercept) conductivity
## 1.328075e-01 4.317016e-05
## [1] 0.004228313
## NULL
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## (Intercept) n1_lod
## 0.2367700 0.2765731
## [1] 0.0002072538
## NULL
## (Intercept) n1_loq
## 0.2400919 0.2924249
## [1] 0.0001940499
## NULL
## Error in log(x, base) : non-numeric argument to mathematical function
## (Intercept) n1_num_no_target_control
## 0.2995248 0.7217732
## [1] 2.220059e-05
## NULL
## [1] NaN
## [1] 0
## NULL
## (Intercept) N1Error
## 1.616777e-20 1.593410e-230
## [1] 0.1680539
## NULL
## Error in log(x, base) : non-numeric argument to mathematical function
## (Intercept) n2_lod
## 0.2175018 0.3148867
## [1] 0.0001768324
## NULL
## (Intercept) n2_loq
## 0.2486691 0.2765708
## [1] 0.0002072557
## NULL
## Error in log(x, base) : non-numeric argument to mathematical function
## (Intercept) n2_num_no_target_control
## 0.2995248 0.7217732
## [1] 2.220059e-05
## NULL
## (Intercept) N2Error
## 1.912971e-10 7.804265e-129
## [1] 0.0970591
## NULL
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## (Intercept) ph
## 0.0001011119 0.9449674492
## [1] 8.992814e-07
## NULL
## (Intercept) Pop
## 9.156992e-01 6.371909e-11
## [1] 0.00758504
## NULL
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## (Intercept) tss
## NaN NaN
## [1] NaN
## NULL
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## Error in log(x, base) : non-numeric argument to mathematical function
## (Intercept) mycustomer
## 0.5796107 0.5795374
## [1] 5.374681e-05
## NULL
## (Intercept) AVG
## 9.103090e-02 2.554866e-59
## [1] 0.04513622
## NULL
## (Intercept) wt
## 0.7155899 0.4712583
## [1] 9.086939e-05
## NULL
## (Intercept) N1Pure
## 2.242998e-03 1.801557e-81
## [1] 0.06198631
## NULL
## (Intercept) N2Pure
## 6.229319e-01 1.855587e-41
## [1] 0.03134785
## NULL
## (Intercept) loessN1
## 0.004266055 0.010469474
## [1] 0.001146714
## NULL
ModelDF <- FullDF%>%
group_split(Site) %>%
purrr::map_dfr(LoessFunc,Var = "N1")%>%
filter(!is.na(loessN1))
#fmla <- as.formula(paste0("loessN1 ~ ", paste(PlotVarNames[c(3:56,66)], collapse= " + ")))
#summary(lm(fmla,data = ModelDF))
#names(LoessN1Messurement)